package com.ming.zhihuWebSpider.process;

import com.ming.zhihuWebSpider.pipeline.DouyuPipeline;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} for Douyu (douyu.com).
 *
 * <p>Starting from the home page it follows links that match the list-page and
 * detail-page URL patterns. For every detail page it extracts the fields
 * {@code title}, {@code name}, {@code type} and {@code url} and stores them on the
 * page so that the registered pipeline can consume them.
 *
 *  @author xxd
 *  @date: 2020/05/06
 */
@Component
public class DouyuProcessor implements PageProcessor{

	// NOTE(review): this context is built when the class is initialised, so merely
	// loading the class from another Spring application would create a second
	// context. Kept as-is because other classes in this package may reference the
	// field; consider moving the construction into main().
	static ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath*:/config/spring-*.xml");

	@Autowired
	private DouyuPipeline douyuPipeline;

	// Home page (dots escaped so they match literally, consistent with the patterns below)
	private static final String START_URL  =  "https://www\\.douyu\\.com$";

	// List page
	private static final String URL_LIST = "https://www\\.douyu\\.com/\\w_\\w+";

	// Detail page: a numeric path that does not start with 0
	private static final String URL_DETAIL = "https://www\\.douyu\\.com/[1-9]\\d*$";

	// Crawl settings: retry counts, sleep time, a 3 * 60 * 1000 timeout and a desktop browser user agent.
	private final Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(300).setTimeOut(3 * 60 * 1000)
			.setUserAgent(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

	/**
	 * Handles one downloaded page.
	 *
	 * <p>Home and list pages only contribute further list/detail links. Detail pages
	 * contribute further detail links and, when a title is present, the extracted
	 * result fields. Pages matching none of the patterns are ignored.
	 *
	 * @param page the downloaded page to inspect
	 */
	@Override
	public void process(Page page) {
		if (page.getUrl().regex(START_URL).match() || page.getUrl().regex(URL_LIST).match()) {
			// Home page or list page: queue every list and detail link found on it.
			page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
			page.addTargetRequests(page.getHtml().links().regex(URL_DETAIL).all());
		} else if (page.getUrl().regex(URL_DETAIL).match()) {
			processDetailPage(page);
		}
	}

	/**
	 * Extracts the result fields of a detail page and queues the detail links it contains.
	 *
	 * @param page a page whose URL matched {@link #URL_DETAIL}
	 */
	private void processDetailPage(Page page) {
		// Queue linked detail pages first so they are followed even if this page is skipped.
		page.addTargetRequests(page.getHtml().links().regex(URL_DETAIL).all());

		String title = page.getHtml().xpath("//*[@id=\"js-player-title\"]/div/div/div/div/div/h3/text()").get();
		if (title == null) {
			// No title element found: mark the page as skipped and stop, there is nothing to store.
			page.setSkip(true);
			return;
		}

		String name = page.getHtml().xpath("//*[@id=\"js-player-title\"]/div/div/div/div/div/h2/text()").get();
		String type = page.getHtml().xpath("/html/head/title/text()").get();

		String typeField = extractType(type);
		if (typeField != null) {
			page.putField("type", typeField);
		}
		page.putField("title", title);
		page.putField("name", name);
		page.putField("url",page.getUrl().toString());

		System.out.println(title + " " + name + " " + type);
	}

	/**
	 * Picks the type segment out of the HTML {@code <title>} text, which is split on {@code '_'}.
	 *
	 * @param pageTitle the raw {@code <title>} text, may be {@code null}
	 * @return the second segment when there are at least two, the only segment when
	 *         there is exactly one, or {@code null} when {@code pageTitle} is
	 *         {@code null} or splitting yields no segments
	 */
	private static String extractType(String pageTitle) {
		if (pageTitle == null) {
			return null;
		}
		String[] segments = pageTitle.split("_");
		if (segments.length == 0) {
			// String.split drops trailing empty strings, so a title made only of '_' yields no segments.
			return null;
		}
		return segments.length == 1 ? segments[0] : segments[1];
	}

	/**
	 * Returns the crawl settings used by this processor.
	 *
	 * @return the site configuration
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Runs the crawl from the Douyu home page with 3 threads, sending results to the
	 * injected {@link DouyuPipeline}. The call uses {@code Spider.run()} and returns
	 * when that call returns.
	 *
	 * @throws IllegalStateException if the pipeline has not been injected, i.e. this
	 *         instance was not obtained from the Spring context
	 */
	public void crawl() {
		if (douyuPipeline == null) {
			throw new IllegalStateException("douyuPipeline has not been injected; obtain DouyuProcessor from the Spring context");
		}
		// Use this managed instance rather than constructing a second, unmanaged processor.
		Spider.create(this).addPipeline(douyuPipeline).addUrl("https://www.douyu.com").thread(3).run();
	}

	/**
	 * Command-line entry point: fetches the processor bean from the Spring context and starts the crawl.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		applicationContext.getBean(DouyuProcessor.class).crawl();
	}

}
